Let's dig into non parametric testing, starting with why it is even needed in the first place.
import pandas as pd
import numpy as np
from scipy.stats import bernoulli, binom, norm, ks_2samp, ttest_ind
from scipy import integrate
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc, animation
from IPython.core.display import display, HTML
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from _plotly_future_ import v4_subplots
from plotly import tools
import plotly
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.figure_factory as ff
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style="white", palette="husl")
sns.set_context("talk")
sns.set_style("ticks")
# Draw two samples of 200 points from N(0,1) and N(1,1) to motivate the t test.
sample_size = 200
mean1 = 0
mean2 = 1
# NOTE(review): np.random.normal's second argument is the STANDARD DEVIATION
# (scale), not the variance. With var1 == var2 == 1 the two coincide here, but
# the `var*` naming is misleading — confirm intent anywhere the value is not 1.
var1 = 1
var2 = 1
samp1 = np.random.normal(mean1, var1, sample_size)
samp2 = np.random.normal(mean2, var2, sample_size)
trace1a = go.Histogram(
x=samp1,
nbinsx=50,
name="Sample 1",
marker_color='blue',
opacity=0.7
)
trace2a = go.Histogram(
x=samp2,
nbinsx=50,
name="Sample 2",
marker_color='crimson',
opacity=0.7
)
x_axis = np.arange(-3, 4, 0.01)
y_samp1_pdf = norm.pdf(x_axis, mean1, var1)
y_samp2_pdf = norm.pdf(x_axis, mean2, var2)
y_samp1_cdf = norm.cdf(x_axis, mean1, var1)
y_samp2_cdf = norm.cdf(x_axis, mean2, var2)
trace1b = go.Scatter(
x=x_axis,
y=y_samp1_pdf,
marker = dict(color='blue'),
fill='tozeroy',
fillcolor='rgba(0, 0, 255, 0.2)',
name="Sample 1",
showlegend=False
)
trace2b = go.Scatter(
x=x_axis,
y=y_samp2_pdf,
marker = dict(color='red'),
fill='tozeroy',
fillcolor='rgba(255, 0, 0, 0.2)',
name="Sample 2",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 Histograms', 'Sample 1 & 2 PDFs')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis=dict(title="Count"),
xaxis2=dict(title='X'),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
We can perform a traditional t test to determine if the samples are drawn from different distributions:
t2, p2 = ttest_ind(samp1, samp2)
display(f"t: {t2}, p: {p2}")
The t test was able to nicely identify that these samples were in fact drawn from different distributions. For all of the detail behind the inner workings of the t test, I recommend checking out my post on statistical inference.
# Two samples with equal means but different spreads — the case the t test misses.
sample_size = 200
mean1 = 1
mean2 = 1
var1 = 1
# NOTE(review): the value 5 is passed to np.random.normal and norm.pdf as the
# `scale` argument, i.e. the standard deviation (variance 25), even though the
# variable name and surrounding prose call it a "variance of 5". The demo is
# internally consistent (sampling and PDF both use scale=5), but the variable
# should arguably be named `std2` — confirm intended meaning.
var2 = 5
samp1 = np.random.normal(mean1, var1, sample_size)
samp2 = np.random.normal(mean2, var2, sample_size)
trace1a = go.Histogram(
x=samp1,
nbinsx=20,
name="Sample 1",
marker_color='blue',
opacity=0.7
)
trace2a = go.Histogram(
x=samp2,
nbinsx=50,
name="Sample 2",
marker_color='crimson',
opacity=0.7
)
x_axis = np.arange(-15, 15, 0.01)
y_samp1_pdf = norm.pdf(x_axis, mean1, var1)
y_samp2_pdf = norm.pdf(x_axis, mean2, var2)
y_samp1_cdf = norm.cdf(x_axis, mean1, var1)
y_samp2_cdf = norm.cdf(x_axis, mean2, var2)
trace1b = go.Scatter(
x=x_axis,
y=y_samp1_pdf,
marker = dict(color='blue'),
fill='tozeroy',
fillcolor='rgba(0, 0, 255, 0.2)',
name="Sample 1",
showlegend=False
)
trace2b = go.Scatter(
x=x_axis,
y=y_samp2_pdf,
marker = dict(color='red'),
fill='tozeroy',
fillcolor='rgba(255, 0, 0, 0.2)',
name="Sample 2",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 Histograms', 'Sample 1 & 2 PDFs')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis=dict(title="Count"),
xaxis2=dict(title='X'),
yaxis2=dict(title="Probability Density"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
How does the t test perform in this case? Very poorly:
t2, p2 = ttest_ind(samp1, samp2)
display(f"t: {t2}, p: {p2}")
It incorrectly concludes that there is no meaningful difference between these two samples. Now, what could be a way to identify this difference in variance? Recall the cumulative distribution function:
trace1a = go.Histogram(
x=samp1,
nbinsx=20,
name="Sample 1",
marker_color='blue',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
trace2a = go.Histogram(
x=samp2,
nbinsx=20,
name="Sample 2",
marker_color='crimson',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
x_axis = np.arange(-15, 15, 0.01)
y_samp1_cdf = norm.cdf(x_axis, mean1, var1)
y_samp2_cdf = norm.cdf(x_axis, mean2, var2)
trace1b = go.Scatter(
x=x_axis,
y=y_samp1_cdf,
marker = dict(color='blue'),
name="Sample 1",
showlegend=False
)
trace2b = go.Scatter(
x=x_axis,
y=y_samp2_cdf,
marker = dict(color='red'),
name="Sample 2",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 Cumulative Histograms', 'Sample 1 & 2 Cumulative Distribution Functions')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis=dict(title="Count"),
xaxis2=dict(title='X'),
yaxis2=dict(title=f'$P(X \leq x)$'),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
There is clearly a noticeable difference in the CDFs! That is where the KS score comes into play. It tries to find the maximum distance between the empirical CDFs. We can look at the empirical CDFs with the help of this function:
def empirical_cdf(x):
    """Return (sorted copy of x, empirical CDF values) for the sample `x`.

    The empirical CDF is the step function
    F_hat(t) = (# of sample points <= t) / n,
    which jumps by 1/n at each of the n sorted data points and ends at 1.0.

    Fixes two defects of the earlier version:
    * `x` is no longer sorted in place (the caller's array is left untouched).
    * steps are 1/n starting at 1/n, matching the definition of F_hat,
      rather than 1/(n-1) starting at 0.
    """
    xs = np.sort(x)  # sorted copy; do not mutate the caller's data
    n = xs.shape[0]
    return xs, np.arange(1, n + 1) / n
Take a look below:
def empirical_cdf(x):
    """Return (sorted copy of x, empirical CDF values) for the sample `x`.

    F_hat(t) = (# of sample points <= t) / n: a step function jumping by 1/n
    at each of the n sorted data points, ending at 1.0.

    Fixes: no longer sorts `x` in place, and uses 1/n steps (per the
    definition of F_hat) instead of 1/(n-1) steps starting at 0.
    """
    xs = np.sort(x)  # sorted copy; caller's array stays untouched
    n = xs.shape[0]
    return xs, np.arange(1, n + 1) / n
x_axis_samp1, emp_cdf_samp1 = empirical_cdf(samp1)
x_axis_samp2, emp_cdf_samp2 = empirical_cdf(samp2)
trace1 = go.Scatter(
x=x_axis_samp1,
y=emp_cdf_samp1,
marker = dict(color='blue'),
name="Sample 1"
)
trace2 = go.Scatter(
x=x_axis_samp2,
y=emp_cdf_samp2,
marker = dict(color='red'),
name="Sample 2"
)
data = [trace1, trace2]
layout = go.Layout(
width=700,
height=400,
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title="Empirical CDF",
xaxis=dict(title="X"),
yaxis=dict(title="Cumulative Probability")
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
Now, the KS score will find the maximum difference in height between the two curves above. This can be seen in the plot below:
def ks_score_implementation(x1, x2):
    """Two-sample KS statistic plus the pair of ECDF points that realise it.

    Parameters
    ----------
    x1, x2 : array-like
        The two samples (need not be sorted or equal length).

    Returns
    -------
    d : float
        The KS statistic: max over the pooled observations of
        |F1_hat(t) - F2_hat(t)|.
    max_distance_points : tuple
        ((x, F1_hat(x)), (x, F2_hat(x))) at the x where the gap is largest;
        used only for plotting the vertical "max distance" segment.
    """
    # Sort each sample and pool all observations: the ECDFs only change at
    # observed data points, so the pooled values are the only candidates.
    data1 = np.sort(x1)
    data2 = np.sort(x2)
    data_all = np.concatenate([data1, data2])
    # Evaluate each sample's empirical CDF over the SAME pooled inputs so the
    # two arrays are aligned element-wise. Note these are not traditional,
    # plottable CDFs — just F_hat values at identical inputs from data_all.
    cdf1_data = np.searchsorted(data1, data_all, side='right') / data1.shape[0]
    cdf2_data = np.searchsorted(data2, data_all, side='right') / data2.shape[0]
    cdf_diffs = cdf1_data - cdf2_data
    minS = -np.min(cdf_diffs)  # largest gap where sample 2's ECDF is on top
    maxS = np.max(cdf_diffs)   # largest gap where sample 1's ECDF is on top
    d = max(minS, maxS)
    # BUG FIX: the previous version computed `-np.argmin(cdf_diffs)` (negating
    # an INDEX) and then took max() of the two indices, which selected an
    # arbitrary location whenever the min-side gap realised the statistic.
    # Pick the index belonging to whichever signed extreme actually equals d.
    d_idx = int(np.argmin(cdf_diffs)) if minS > maxS else int(np.argmax(cdf_diffs))
    max_distance_points = ((data_all[d_idx], cdf1_data[d_idx]),
                           (data_all[d_idx], cdf2_data[d_idx]))
    return d, max_distance_points
def empirical_dist_helper(sample, t):
n = sample.shape[0]
return np.searchsorted(sample, t, side='right') / n
x_axis_samp1, emp_cdf_samp1 = empirical_cdf(samp1)
x_axis_samp2, emp_cdf_samp2 = empirical_cdf(samp2)
trace1 = go.Scatter(
x=x_axis_samp1,
y=emp_cdf_samp1,
marker = dict(color='blue'),
name="Sample 1"
)
trace2 = go.Scatter(
x=x_axis_samp2,
y=emp_cdf_samp2,
marker = dict(color='red'),
name="Sample 2"
)
# Find points that yield largest distance (and hence highest KS Score), plot vertical line
_, ks_points = ks_score_implementation(samp1, samp2)
ks_point1, ks_point2 = ks_points
trace3 = go.Scatter(
x=[ks_point1[0]],
y=[ks_point1[1]],
marker = dict(
color = 'green',
size=6
),
mode="markers",
showlegend=False
)
trace4 = go.Scatter(
x=[ks_point2[0]],
y=[ks_point2[1]],
marker = dict(
color = 'green',
size=6
),
mode="markers",
showlegend=False
)
trace5 = go.Scatter(
x=[ks_point1[0], ks_point2[0]],
y=[ks_point1[1], ks_point2[1]],
mode='lines',
marker = dict(
color = 'green',
size=6
),
name="KS score (max distance <br>between empirical cdfs)"
)
data = [trace1, trace2, trace3, trace4, trace5]
layout = go.Layout(
width=700,
height=400,
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title="Empirical CDF",
xaxis=dict(title="X"),
yaxis=dict(title="Cumulative Probability")
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
This distance is then used to determine the KS score, along with a p value relating to if the samples are indeed drawn from different populations. How does the KS score perform here?
display(ks_2samp(samp1, samp2))
It performs very well! It easily identifies that these samples are in fact drawn from different distributions.
Let's look at another example; assume that one of our samples is drawn from a bimodal distribution.
sample_size = 1000
mean1a = -1
mean1b = 3
var1a = 1
var1b = 1
gaussian1a = np.random.normal(mean1a, var1a, sample_size)
gaussian1b = np.random.normal(mean1b, var1b, sample_size)
samp1 = np.concatenate((gaussian1a, gaussian1b))
mean2 = 1
var2 = 1
samp2 = np.random.normal(mean2, var2, sample_size)
trace1a = go.Histogram(
x=samp1,
nbinsx=50,
name="Sample 1",
marker_color='blue',
opacity=0.7
)
trace2a = go.Histogram(
x=samp2,
nbinsx=25,
name="Sample 2",
marker_color='crimson',
opacity=0.7
)
trace1b = go.Histogram(
x=samp1,
nbinsx=20,
name="Sample 1",
marker_color='blue',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
trace2b = go.Histogram(
x=samp2,
nbinsx=20,
name="Sample 2",
marker_color='crimson',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 Histograms', 'Sample 1 & 2 Cumulative Histograms')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis=dict(title="Count"),
xaxis2=dict(title='X'),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
However, both samples still have approximately the same overall mean of 1:
display(f'Mean of Sample 1: {samp1.mean()}' )
display(f'Mean of Sample 2: {samp2.mean()}')
But again, we can clearly see that they are indeed drawn from different populations, via the PDF and CDFs below:
x_axis = np.arange(-15, 15, 0.01)
y_samp1_pdf_func = lambda x: 0.5 * norm.pdf(x, mean1a, var1a) + 0.5 * norm.pdf(x, mean1b, var1b)
y_samp1_cdf_func = lambda x: 0.5 * norm.cdf(x, mean1a, var1a) + 0.5 * norm.cdf(x, mean1b, var1b)
y_samp1_pdf = y_samp1_pdf_func(x_axis)
y_samp1_cdf = y_samp1_cdf_func(x_axis)
# Assert that our distribution has been normalized correctly (i.e. the area sums to 1)
assert round(integrate.quad(y_samp1_pdf_func, -np.inf, np.inf)[0] - 1, 8) == 0.0
y_samp2_pdf = norm.pdf(x_axis, mean2, var2)
y_samp2_cdf = norm.cdf(x_axis, mean2, var2)
trace1a = go.Scatter(
x=x_axis,
y=y_samp1_pdf,
marker = dict(color='blue'),
fill='tozeroy',
fillcolor='rgba(0, 0, 255, 0.2)',
name="Sample 1"
)
trace2a = go.Scatter(
x=x_axis,
y=y_samp2_pdf,
marker = dict(color='red'),
fill='tozeroy',
fillcolor='rgba(255, 0, 0, 0.2)',
name="Sample 2"
)
trace1b = go.Scatter(
x=x_axis,
y=y_samp1_cdf,
marker = dict(color='blue'),
name="Sample 1",
showlegend=False
)
trace2b = go.Scatter(
x=x_axis,
y=y_samp2_cdf,
marker = dict(color='red'),
name="Sample 2",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 PDFs', 'Sample 1 & 2 CDFs')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis1=dict(title="Probability Density"),
xaxis2=dict(title='X'),
yaxis2=dict(title="Cumulative Probability"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
So, how does the t test perform here?
t2, p2 = ttest_ind(samp1, samp2)
display(f"t: {t2}, p: {p2}")
Again, it fails to distinguish the differences between our two distributions (since they have the same mean!). Let's check the KS Score:
ks_2samp(samp1, samp2)
Again the KS score is able to confidently distinguish the difference between the two samples. Visually it looks like:
x_axis_samp1, emp_cdf_samp1 = empirical_cdf(samp1)
x_axis_samp2, emp_cdf_samp2 = empirical_cdf(samp2)
trace1 = go.Scatter(
x=x_axis_samp1,
y=emp_cdf_samp1,
marker = dict(color='blue'),
name="Sample 1"
)
trace2 = go.Scatter(
x=x_axis_samp2,
y=emp_cdf_samp2,
marker = dict(color='red'),
name="Sample 2"
)
# Find points that yield largest distance (and hence highest KS Score), plot vertical line
_, ks_points = ks_score_implementation(samp1, samp2)
ks_point1, ks_point2 = ks_points
trace3 = go.Scatter(
x=[ks_point1[0]],
y=[ks_point1[1]],
marker = dict(
color = 'green',
size=6
),
mode="markers",
showlegend=False
)
trace4 = go.Scatter(
x=[ks_point2[0]],
y=[ks_point2[1]],
marker = dict(
color = 'green',
size=6
),
mode="markers",
showlegend=False
)
trace5 = go.Scatter(
x=[ks_point1[0], ks_point2[0]],
y=[ks_point1[1], ks_point2[1]],
mode='lines',
marker = dict(
color = 'green',
size=6
),
name="KS score (max distance <br>between empirical cdfs)"
)
data = [trace1, trace2, trace3, trace4, trace5]
layout = go.Layout(
width=700,
height=400,
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title="Empirical CDF",
xaxis=dict(title="X"),
yaxis=dict(title="Cumulative Probability")
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
Now that we have taken the time to motivate the need behind the KS score, let's take a look under the hood to see how it actually is implemented.
We can do this via an example. To start, we can generate two samples that are indeed from two separate distributions; we will sample from a gaussian that has a mean of 0 and variance of 1, and a second gaussian that has a mean of 1 and variance of 1. We will call these two data generating distributions group 1 and group 2 respectively. These population distributions are shown below:
sample_size = 200
mean1 = 0
mean2 = 1
var1 = 1
var2 = 1
x_axis = np.arange(-3, 4, 0.01)
y_samp1_pdf = norm.pdf(x_axis, mean1, var1)
y_samp2_pdf = norm.pdf(x_axis, mean2, var2)
y_samp1_cdf = norm.cdf(x_axis, mean1, var1)
y_samp2_cdf = norm.cdf(x_axis, mean2, var2)
trace1a = go.Scatter(
x=x_axis,
y=y_samp1_pdf,
marker = dict(color='blue'),
fill='tozeroy',
fillcolor='rgba(0, 0, 255, 0.2)',
name="Group 1"
)
trace2a = go.Scatter(
x=x_axis,
y=y_samp2_pdf,
marker = dict(color='crimson'),
fill='tozeroy',
fillcolor='rgba(255, 0, 0, 0.2)',
name="Group 2"
)
trace1b = go.Scatter(
x=x_axis,
y=y_samp1_cdf,
marker = dict(color='blue'),
name="",
showlegend=False
)
trace2b = go.Scatter(
x=x_axis,
y=y_samp2_cdf,
marker = dict(color='crimson'),
name="",
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Group 1 & 2 PDFs', 'Group 1 & 2 CDFs')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis1=dict(title="Probability Density"),
xaxis2=dict(title='X'),
yaxis2=dict(title="Cumulative Probability"),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
Now we can sample 200 data points from each group and plot the resulting histograms:
samp1 = np.random.normal(mean1, var1, sample_size)
samp2 = np.random.normal(mean2, var2, sample_size)
trace1a = go.Histogram(
x=samp1,
nbinsx=20,
name="Sample 1",
marker_color='blue',
opacity=0.7
)
trace2a = go.Histogram(
x=samp2,
nbinsx=20,
name="Sample 2",
marker_color='crimson',
opacity=0.7
)
trace1b = go.Histogram(
x=samp1,
nbinsx=20,
name="Sample 1",
marker_color='blue',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
trace2b = go.Histogram(
x=samp2,
nbinsx=20,
name="Sample 2",
marker_color='crimson',
opacity=0.7,
cumulative_enabled=True,
showlegend=False
)
fig = plotly.subplots.make_subplots(
rows=1,
cols=2,
print_grid=False,
subplot_titles=('Sample 1 & 2 Histograms', 'Sample 1 & 2 Cumulative Histograms')
)
fig.append_trace(trace1a, 1, 1)
fig.append_trace(trace2a, 1, 1)
fig.append_trace(trace1b, 1, 2)
fig.append_trace(trace2b, 1, 2)
fig.update_traces(marker=dict(line=dict(width=1, color='black')))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
layout = go.Layout(
barmode='overlay',
width=950,
height=450,
xaxis1=dict(title="X"),
yaxis=dict(title="Count"),
xaxis2=dict(title='X'),
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)'
)
fig.update_layout(
layout
)
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
Because we know that sample 1 and sample 2 were in fact drawn from different populations (group 1 and group 2), our goal is for a hypothesis test to also come to that same conclusion. Using the out of the box KS test from scipy, we can see if it manages to classify them as being generated by different populations:
display(ks_2samp(samp1, samp2))
And as expected, it does! The question is: how does ks_2samp work under the hood? We have briefly discussed that the test works by calculating the maximum distance between the two samples' empirical CDFs. But, as we will see, that requires some thinking due to indexing issues and limitations arising from the nature of empirical CDFs.
Before we can dig into the actual ks score implementation, it is worth defining what exactly an empirical distribution function is. Simply, it is:
A cumulative distribution function that is a step function that jumps up by $\frac{1}{n}$ for each of the $n$ data points in the sample.
Mathematically, that looks like:
$$\hat{F}_n(t) = \frac{\text{number of elements in the sample } \leq t}{n} = \frac{1}{n} \sum_{i=1}^n \textbf{1}_{X_i \leq t}$$Where $\textbf{1}$ is an indicator function, and our sample contains $X_1,\dots,X_n$. It may be easier to demonstrate in code how exactly this works. Take a look at the function empirical_cdf below:
def empirical_cdf(x):
    """Return (sorted copy of x, empirical CDF values) for the sample `x`.

    Implements F_hat(t) = (# of sample points <= t) / n: a step function that
    jumps up by 1/n at each of the n sorted data points, ending at 1.0 —
    exactly the definition discussed in the surrounding text.

    Fixes: no longer sorts `x` in place (the caller's sample is untouched),
    and steps are 1/n rather than the previous 1/(n-1).
    """
    xs = np.sort(x)  # sorted copy; do not mutate the caller's data
    n = xs.shape[0]
    return xs, np.arange(1, n + 1) / n
This function is an implementation of $\hat{F}$. It takes in x, our sample data points, and first sorts them. Then it creates an array that is the same size of x, but at each successive index the value is incremented by $\frac{1}{n}$. We can see how this works below:
def empirical_cdf(x):
    """Return (sorted copy of x, empirical CDF values) for the sample `x`.

    F_hat(t) = (# of sample points <= t) / n — a step function jumping by 1/n
    at each of the n sorted data points, ending at 1.0.

    Fixes: avoids the in-place `x.sort()` side effect and uses 1/n steps
    (matching the stated definition) instead of 1/(n-1).
    """
    xs = np.sort(x)  # sorted copy; caller's array stays untouched
    n = xs.shape[0]
    return xs, np.arange(1, n + 1) / n
x_axis_samp1, emp_cdf_samp1 = empirical_cdf(samp1)
x_axis_samp2, emp_cdf_samp2 = empirical_cdf(samp2)
trace1 = go.Scatter(
x=x_axis_samp1,
y=emp_cdf_samp1,
marker = dict(color='blue'),
name="Sample 1"
)
trace2 = go.Scatter(
x=x_axis_samp2,
y=emp_cdf_samp2,
marker = dict(color='red'),
name="Sample 2"
)
data = [trace1, trace2]
layout = go.Layout(
width=650,
height=400,
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title="Empirical CDF",
xaxis=dict(title="X"),
yaxis=dict(title="Cumulative Probability")
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
I want to take a moment to highlight a property of empirical functions and specifically how they compare to continuous functions. In general, if you have a continuous function, say $f(x) = x^3 + 2$, you can plot that function by generating an array of evenly spaced $x$ values, that are meant to represent $\mathbb{R}$ (while yes these are in fact discrete points from a computational standpoint, we are meant to view them as a continuous number line). This allows for a smooth output, and it means that the function can be reasonably represented by the equation $f(x) = x^3 + 2$. In other words, it can take in any element (data point) $x$ from set $X$, the domain of the function, and associate it with a single element $y$ from another set $Y$, the codomain of the function. It is does not need the data points nearest $x$ in order to do this.
On the other hand, consider our empirical distribution function. It is very much defined by its data points. We can see this since for each input value of $t$, the output value $\hat{F}(t)$ requires knowing all of the data, including the total size. There are two main consequences to this that will affect our KS score implementation:
I want to make sure the above two points are very clear, so let's start with number 1. In the above plot, it is obvious that our curves are indeed different. Generally, if we were to perform an element-wise subtraction of our $y$ values (cumulative probability), we would be able to find the maximum distance without a problem. However, let's see what happens when we try that now:
emp_cdf_samp1 - emp_cdf_samp2
display(emp_cdf_samp1 - emp_cdf_samp2)
We end up getting a difference of $0$ for every single element. That is clearly not what we want, but it comes back to the fact at this point our empirical distributions, emp_cdf_samp1 and emp_cdf_samp2, are offset from each other. For instance, let's look at the first 10 corresponding x values from each sample, along with their cdf's:
display(pd.DataFrame({
'sample 1 x values': x_axis_samp1[0:10],
'sample 2 x values': x_axis_samp2[0:10],
'sample 1 empirical cdf': emp_cdf_samp1[0:10],
'sample 2 empirical cdf': emp_cdf_samp2[0:10],
}))
We can see in the above table that the sample 1 $x$ values at each corresponding index do not match the sample 2 $x$ values! In order to perform a proper element wise subtraction of the empirical CDF's we need to do so for the same $x$ value. In other words, find the difference between the CDF's when $x=1$ for instance.
Again, it is really worth highlighting the fact that this issue arises in part due to how we generally plot, define, and compare functions. Say we want to compare two functions of x, y1 and y2. In general, we would create a continuous x variable that will be passed in to each function, y1 and y2. Since each function is given the exact same x array, they each will be indexed properly; this means that y1[87] and y2[87] will each represent the corresponding function output based on the value of x[87].
Because our empirical distribution function is only defined for the x values contained in the sample, and not all x values, this is not the case in the present problem!
LEAVING OFF
TODO: Explain how empirical_cdf works: https://en.wikipedia.org/wiki/Empirical_distribution_function
The problem that we run into is that our x values are not the same for each distribution! This actually comes back to the way in which distributions differ compared to standard continuous functions. A continuous distribution function in reality is an abstraction (compared to a histogram or discrete distribution) that would not occur in the real world. It is meant to capture the probability of certain x values occurring; this probability is a function of the x values; some are more probable than others. The key here to keep in mind is that when we gather data we do not observe a nice continuous spectrum of x values. We only get a certain discrete number. Hence, in our plot above, there are many (if not all) x values that occur in sample 1 but not sample 2.
This is actually a very interesting thing to keep in mind with functions. We generally consider them as taking a continuous input, x, (i.e. uniformly distributed), and determining a response for that value. Distribution functions are different in the sense that their input, by definition, does not need to be uniformly distributed! It can take on any sort of distribution/spread. The response is (y axis) is meant to capture how probable (based on the entire input) a given input value is. In other words, for a distribution function to work successfully it must see all of the data ahead of time.
We can then abstract from these messy distributions to more theoretical ones, such as the normal. Touch on taleb here.
Now, when it comes to our implementation, we need to figure out a way to computationally compare the two above empirical CDFS. Let's look at how we may do that.
ks_2samp(samp1, samp2)
Above is the actual KS score. How may we get there? At first it may seem like we could simply subtract the two curves above — let's give that a shot!
diff = emp_cdf_samp1 - emp_cdf_samp2
diff
It looks like, according to that calculation, our curves are identical — but based on visual inspection (and the fact that we know the generating functions) this is incorrect. Where did we go wrong? Well, all we did was take the differences of the two curves' y values. However, we never checked to see if the x values actually matched up (remember, we want to take the difference between the CDFs at the same point in x).
Let's see where we went wrong by looking at the index 50:
idx = 50
emp_cdf_samp1[idx]
emp_cdf_samp2[idx]
We can see that at an index of 50 our empirical cdf's contain the same value. The problem is, while we generally treat our index to be similar to our x value, in this case they do not match up! Let's take a look:
x_axis_samp1[idx]
x_axis_samp2[idx]
So in other words we just subtracted the empirical cdf of sample 1 at x = -7.616 from the empirical cdf of sample 2 at x = 0.372. Visually, what we did is shown below:
x_axis_samp1, emp_cdf_samp1 = empirical_cdf(samp1)
x_axis_samp2, emp_cdf_samp2 = empirical_cdf(samp2)
trace1 = go.Scatter(
x=x_axis_samp1,
y=emp_cdf_samp1,
marker = dict(color='blue'),
name="Sample 1"
)
trace2 = go.Scatter(
x=x_axis_samp2,
y=emp_cdf_samp2,
marker = dict(color='red'),
name="Sample 1"
)
trace3 = go.Scatter(
x=[x_axis_samp1[idx]],
y=[emp_cdf_samp1[idx]],
marker = dict(color = 'blue',size=10),
name="Sample 1, index=50",
mode='markers'
)
trace4 = go.Scatter(
x=[x_axis_samp2[idx]],
y=[emp_cdf_samp2[idx]],
marker = dict(color = 'red',size=10),
name="Sample 2, index=50",
mode='markers'
)
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
width=650,
height=400,
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title="Empirical CDF",
xaxis=dict(title="X"),
yaxis=dict(title="Cumulative Probability")
)
fig = go.Figure(data=data, layout=layout)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f1f1f1')
plotly.offline.iplot(fig)
# html_fig = plotly.io.to_html(fig, include_plotlyjs=True)
# display(HTML(html_fig))
So, we clearly can see above that we simply subtracted the y values at the incorrect index. We chose a common index (50) for each, but that clearly did not match up correctly. In reality we want to choose a common x value and subtract the empirical CDFs at that point.
What is a better way that we could go about this? Well, a place to start is to realize that in reality we want to calculate:
$$\hat{F}_n(t) = \frac{\text{number of elements in the sample } \leq t}{n} = \frac{1}{n} \sum_{i=1}^n \textbf{1}_{X_i \leq t}$$Where $\textbf{1}$ is an indicator function, and our sample contains $X_1,\dots,X_n$. When I had plotted above, I have calculated $\hat{F}$ for $x \in X$.
That defines the empirical distribution. It also allows us to see why the actual y values for the empirical CDFs above were identical: the cumulative probability is only a function of the number of data points in the sample less than or equal to the current data point. In other words, it is a step function that jumps by $\frac{1}{n}$ at each of the $n$ data points. Now, in this empirical distribution, intuition tells us that where we have a large slope (a large rise over a small run), there is a high density of points in that range. Likewise, a small slope corresponds to a low density of points in that range. Does this make sense? Yes! Remember that the CDF is the integral of the PDF, and likewise the PDF is the derivative of the CDF! So, where the slope is steepest above we would expect a peak (mode of the distribution), and indeed this is what we see!
Now, based on this function, can we write a helper function to determine the empirical distribution value of a single input? Indeed we can!
def empirical_dist_helper(sample, t):
    """Evaluate F_hat, the empirical CDF of `sample`, at the point(s) `t`.

    Counts how many sorted sample values fall at or below each entry of `t`
    (right-sided binary search) and normalises by the sample size.
    """
    n = sample.shape[0]
    positions = np.searchsorted(sample, t, side='right')
    return positions / n
empirical_dist_helper(np.array([3,4,10,20]), 15)
Now that we have that helper, which utilizes np.searchsorted, how can we make the next jump and use this to compare two empirical CDFs? Well, in reality what we want to do is determine, based on all of our data (i.e. the concatenation of our samples — this defines our entire sample space), what $\hat{F}$ evaluates to for each sample. An example will make this more clear.
Let's look at the 90th $x$ point in sample1:
samp1[90]
Let's use empirical_dist_helper in order to help us find $\hat{F}$ for each sample:
x_90 = samp1[90]
empirical_dist_helper(samp1, x_90)
empirical_dist_helper(samp2, x_90)
What we did was just calculated the empirical distribution value for a given input, x_90 = -0.108..., for both samp1 and samp2.
Now, our final objective is to calculate the empirical distribution value for all possible values of our sample space! In other words, we want to take all values in samp1 and samp2 and run them through empirical_dist_helper. Thankfully np.searchsorted can take an array of query points. A final implementation will look like:
# DOCS: https://github.com/scipy/scipy/blob/v0.15.1/scipy/stats/stats.py#L4029
def ks_score_implementation(x1, x2):
    """Compute the two-sample KS statistic for samples `x1` and `x2`.

    Both empirical CDFs are evaluated over the pooled observations so the
    resulting arrays line up element-wise on identical x values; the
    statistic is the largest absolute gap between them.
    """
    # Sort each sample; their concatenation defines the total input space.
    sorted1 = np.sort(x1)
    sorted2 = np.sort(x2)
    pooled = np.concatenate([sorted1, sorted2])
    # F_hat for each sample over the shared input space. These are not
    # traditional, plottable CDFs — just aligned cumulative-probability
    # values, one per pooled data point.
    ecdf1 = np.searchsorted(sorted1, pooled, side='right') / sorted1.shape[0]
    ecdf2 = np.searchsorted(sorted2, pooled, side='right') / sorted2.shape[0]
    gaps = ecdf1 - ecdf2
    # Largest gap in either direction (sample 1 on top, or sample 2 on top).
    return max(np.max(gaps), -np.min(gaps))
d = ks_score_implementation(samp1, samp2)
d
And below I expanded the running of empirical_dist_helper across all points in our input space, data_all, but utilizing a for loop:
def ks_score_implementation_expanded(x1, x2):
    """Loop-based (pedagogical) version of the two-sample KS statistic.

    Produces the same result as `ks_score_implementation`, but evaluates the
    empirical CDF one pooled data point at a time so the role of
    `empirical_dist_helper` is explicit.

    NOTE(review): relies on `empirical_dist_helper`, defined earlier in this
    notebook, being in scope.
    """
    # Sort and combine all data into total input space
    data1 = np.sort(x1)
    data2 = np.sort(x2)
    data_all = np.concatenate([data1, data2])
    # Expanding CDF implementation
    cdf1_data = []
    cdf2_data = []
    # Evaluate each sample's F_hat at every pooled observation so the two
    # lists are aligned element-wise on the same x values.
    for point in data_all:
        cdf1_data.append(empirical_dist_helper(data1, point))
        cdf2_data.append(empirical_dist_helper(data2, point))
    cdf_diffs = np.asarray(cdf1_data) - np.asarray(cdf2_data)
    # The KS statistic is the largest absolute gap, whichever sign it has.
    minS = -np.min(cdf_diffs)
    maxS = np.max(cdf_diffs)
    d = max(minS, maxS)
    return d
ks_score_implementation_expanded(samp1, samp2)
Remember, when doing the search sorted we are literally finding what the CDF value would be of each data point, across the entire data set. So, in essence we:
The key idea here is that these two CDF sample arrays are NOT traditional CDFs! They are simply lists that are meant to hold the values of cumulative probability of data points from our total data set. They are not CDFs in the traditional sense. Our goal here is not to plot-we are not creating a uniformly distributed x axis and passing that in to some function (via. np.arange, np.linspace, etc).